data <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data", header=TRUE, sep=",")
summary(data)
## M X0.455 X0.365 X0.095
## F:1307 Min. :0.075 Min. :0.0550 Min. :0.0000
## I:1342 1st Qu.:0.450 1st Qu.:0.3500 1st Qu.:0.1150
## M:1527 Median :0.545 Median :0.4250 Median :0.1400
## Mean :0.524 Mean :0.4079 Mean :0.1395
## 3rd Qu.:0.615 3rd Qu.:0.4800 3rd Qu.:0.1650
## Max. :0.815 Max. :0.6500 Max. :1.1300
## X0.514 X0.2245 X0.101 X0.15
## Min. :0.0020 Min. :0.0010 Min. :0.00050 Min. :0.0015
## 1st Qu.:0.4415 1st Qu.:0.1860 1st Qu.:0.09337 1st Qu.:0.1300
## Median :0.7997 Median :0.3360 Median :0.17100 Median :0.2340
## Mean :0.8288 Mean :0.3594 Mean :0.18061 Mean :0.2389
## 3rd Qu.:1.1533 3rd Qu.:0.5020 3rd Qu.:0.25300 3rd Qu.:0.3290
## Max. :2.8255 Max. :1.4880 Max. :0.76000 Max. :1.0050
## X15
## Min. : 1.000
## 1st Qu.: 8.000
## Median : 9.000
## Mean : 9.932
## 3rd Qu.:11.000
## Max. :29.000
colnames(data)
## [1] "M" "X0.455" "X0.365" "X0.095" "X0.514" "X0.2245" "X0.101"
## [8] "X0.15" "X15"
colnames(data) <- c("sex", "length", "diameter", "height",
"whole_weight", "shucked_weight",
"viscera_weight", "shell_weight", "rings")
colnames(data)
## [1] "sex" "length" "diameter" "height"
## [5] "whole_weight" "shucked_weight" "viscera_weight" "shell_weight"
## [9] "rings"
data$sex <- factor(c("Female", "Infant", "Male")[data$sex])
par(mfrow=c(1,3))
hist(data$diameter, main = "Диаметр, мм")
hist(data$height, main = "Высота, мм")
hist(data$whole_weight, main = "Полный вес, гр")
Видим ассиметрию https://en.wikipedia.org/wiki/Skewness и выбросы (от них нужно избавиться)
Визуализация возможных зависимостей
par(mfrow=c(1,2))
plot(data$diameter, data$whole_weight,'p',main = "Зависимость веса от диаметра")
plot(data$height, data$whole_weight,'p',main = "Зависимость веса от высоты")
Исследование увиденой зависимости 1.Построить линейные модели при помощи функции lm, посмотреть их характеристики
Линейная модель зависимости веса от диаметра
diameter_linear_model <- lm(whole_weight~diameter, data)
diameter_linear_model
##
## Call:
## lm(formula = whole_weight ~ diameter, data = data)
##
## Coefficients:
## (Intercept) diameter
## -1.036 4.573
summary(diameter_linear_model)
##
## Call:
## lm(formula = whole_weight ~ diameter, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56747 -0.12310 -0.03997 0.07211 1.14104
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.03645 0.01216 -85.2 <2e-16 ***
## diameter 4.57295 0.02898 157.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1858 on 4174 degrees of freedom
## Multiple R-squared: 0.8565, Adjusted R-squared: 0.8564
## F-statistic: 2.491e+04 on 1 and 4174 DF, p-value: < 2.2e-16
plot(diameter_linear_model)
Линейная модель веса от высоты
height_linear_model <- lm(whole_weight~height, data)
height_linear_model
##
## Call:
## lm(formula = whole_weight ~ height, data = data)
##
## Coefficients:
## (Intercept) height
## -0.5114 9.6054
summary(height_linear_model)
##
## Call:
## lm(formula = whole_weight ~ height, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7487 -0.1488 -0.0346 0.1151 1.5238
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.51140 0.01516 -33.73 <2e-16 ***
## height 9.60540 0.10408 92.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2813 on 4174 degrees of freedom
## Multiple R-squared: 0.6711, Adjusted R-squared: 0.671
## F-statistic: 8517 on 1 and 4174 DF, p-value: < 2.2e-16
plot(height_linear_model)
data_without_emissions <- data[data$diameter>0.1, ]
diameter_linear_model_new <- lm(whole_weight~diameter, data_without_emissions)
diameter_linear_model_new
##
## Call:
## lm(formula = whole_weight ~ diameter, data = data_without_emissions)
##
## Coefficients:
## (Intercept) diameter
## -1.048 4.598
summary(diameter_linear_model_new)
##
## Call:
## lm(formula = whole_weight ~ diameter, data = data_without_emissions)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56815 -0.12225 -0.03874 0.07345 1.13705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.04765 0.01216 -86.12 <2e-16 ***
## diameter 4.59849 0.02896 158.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1846 on 4169 degrees of freedom
## Multiple R-squared: 0.8581, Adjusted R-squared: 0.8581
## F-statistic: 2.521e+04 on 1 and 4169 DF, p-value: < 2.2e-16
plot(diameter_linear_model_new)
Линейная модель зависимости веса от высоты без выбросов
data_without_emissions <- data[data$height<0.4&data$height>0.05&data$diameter>0.1, ]
dw_linear_model <- lm(whole_weight~+height+diameter, data_without_emissions)
dw_linear_model
##
## Call:
## lm(formula = whole_weight ~ +height + diameter, data = data_without_emissions)
##
## Coefficients:
## (Intercept) height diameter
## -1.120 3.763 3.473
summary(dw_linear_model)
##
## Call:
## lm(formula = whole_weight ~ +height + diameter, data = data_without_emissions)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52231 -0.10868 -0.03049 0.07438 1.01366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.12005 0.01168 -95.91 <2e-16 ***
## height 3.76302 0.16194 23.24 <2e-16 ***
## diameter 3.47294 0.06292 55.20 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1664 on 4105 degrees of freedom
## Multiple R-squared: 0.8817, Adjusted R-squared: 0.8817
## F-statistic: 1.53e+04 on 2 and 4105 DF, p-value: < 2.2e-16
plot(dw_linear_model)
all_in_one_linear_model <- lm(whole_weight~.-shucked_weight-viscera_weight-shell_weight, data_without_emissions)
all_in_one_linear_model
##
## Call:
## lm(formula = whole_weight ~ . - shucked_weight - viscera_weight -
## shell_weight, data = data_without_emissions)
##
## Coefficients:
## (Intercept) sexInfant sexMale length diameter
## -1.157326 -0.021696 0.015360 1.911435 1.229664
## height rings
## 3.580197 -0.002294
summary(all_in_one_linear_model)
##
## Call:
## lm(formula = whole_weight ~ . - shucked_weight - viscera_weight -
## shell_weight, data = data_without_emissions)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46840 -0.10704 -0.03456 0.06938 1.05602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.1573263 0.0167308 -69.174 < 2e-16 ***
## sexInfant -0.0216956 0.0075909 -2.858 0.00428 **
## sexMale 0.0153597 0.0061246 2.508 0.01219 *
## length 1.9114347 0.1307500 14.619 < 2e-16 ***
## diameter 1.2296643 0.1636835 7.512 7.08e-14 ***
## height 3.5801973 0.1647054 21.737 < 2e-16 ***
## rings -0.0022938 0.0009993 -2.295 0.02176 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1617 on 4101 degrees of freedom
## Multiple R-squared: 0.8885, Adjusted R-squared: 0.8883
## F-statistic: 5446 on 6 and 4101 DF, p-value: < 2.2e-16
plot(all_in_one_linear_model)
data.noout <- data_without_emissions
odds <- seq(1, nrow(data.noout), by=2)
data.in <- data.noout[odds, ]
data.out <- data.noout[-odds, ]
linear.model.half<-lm(whole_weight~.-shucked_weight-viscera_weight-shell_weight,data.in)
linear.model.half
##
## Call:
## lm(formula = whole_weight ~ . - shucked_weight - viscera_weight -
## shell_weight, data = data.in)
##
## Coefficients:
## (Intercept) sexInfant sexMale length diameter
## -1.158210 -0.024309 0.024274 1.899176 1.165536
## height rings
## 3.812492 -0.001947
summary(linear.model.half)
##
## Call:
## lm(formula = whole_weight ~ . - shucked_weight - viscera_weight -
## shell_weight, data = data.in)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44573 -0.10895 -0.03478 0.07045 1.03577
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.158210 0.023804 -48.656 < 2e-16 ***
## sexInfant -0.024309 0.010828 -2.245 0.02488 *
## sexMale 0.024274 0.008793 2.761 0.00582 **
## length 1.899176 0.180661 10.512 < 2e-16 ***
## diameter 1.165536 0.227570 5.122 3.31e-07 ***
## height 3.812492 0.239939 15.889 < 2e-16 ***
## rings -0.001947 0.001461 -1.332 0.18288
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1639 on 2047 degrees of freedom
## Multiple R-squared: 0.8888, Adjusted R-squared: 0.8885
## F-statistic: 2727 on 6 and 2047 DF, p-value: < 2.2e-16
plot(linear.model.half)
Cпрогнозировать (функция predict) значения во второй части
data.predict <- predict(linear.model.half, data.out)
plot (data.out$whole_weight, data.predict)
Проверка качества прогноза
cor (data.out$whole_weight, data.predict)
## [1] 0.9424124